/*
* Copyright 2009-2020, KyTea Development Team
* 
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* 
*     http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <kytea/string-util.h>
#include <kytea/kytea-util.h>
#include <kytea/kytea-config.h>
#include <kytea/corpus-io.h>
#include <kytea/config.h>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <fstream>

using namespace kytea;
using namespace std;

// set the type of the input corpus
void KyteaConfig::setIOFormat(const char* str, CorpusFormat & cf) {
    if(!strcmp(str, "full"))      { cf = CORP_FORMAT_FULL; }
    else if(!strcmp(str, "tags"))  { cf = CORP_FORMAT_TAGS; }
    else if(!strcmp(str, "tok"))  { cf = CORP_FORMAT_TOK; }
    else if(!strcmp(str, "part")) { cf = CORP_FORMAT_PART; }
    else if(!strcmp(str, "conf")) { cf = CORP_FORMAT_PROB; }
    else if(!strcmp(str, "prob")) { cf = CORP_FORMAT_PROB; }
    else if(!strcmp(str, "eda"))  { cf = CORP_FORMAT_EDA; }
    else if(!strcmp(str, "raw"))  { cf = CORP_FORMAT_RAW;  }
    else
        THROW_ERROR("Unsupported corpus IO format '" << str << "'");
}


void KyteaConfig::parseTrainCommandLine(int argc, const char ** argv) {
    for(int i = 1; i < argc; i++)
        i += parseTrainArg(argv[i], (i == argc-1?NULL:argv[i+1]));
}


void KyteaConfig::parseRunCommandLine(int argc, const char ** argv) {
    for(int i = 1; i < argc; i++)
        i += parseRunArg(argv[i], (i == argc-1?NULL:argv[i+1]));
}

const string & KyteaConfig::getModelFile() {
    // load the model file if it has not been specified at the command line
    if(model_.length() == 0) {
        if(getenv("KYTEA_MODEL"))
            model_ = getenv("KYTEA_MODEL");
        else {
            model_ = PKGDATADIR;
            model_ += "/model.bin";
        }
    }
    return model_;
}


void KyteaConfig::printUsage() {
    if(onTraining_) {
        // print the training usage
        cerr << 
"train-kytea:" << endl << 
"  A program to train models for KyTea" << endl <<
"" << endl <<
"Input/Output Options: " << endl <<
"  -encode  The text encoding to be used (utf8/euc/sjis; default: utf8)" << endl <<
"  -full    A fully annotated training corpus (multiple possible)" << endl <<
"  -tok     A training corpus that is tokenized with no tags (multiple possible)" << endl <<
"  -part    A partially annotated training corpus (multiple possible)" << endl <<
"  -conf    A confidence annotated training corpus (multiple possible)" << endl <<
"  -feat    A file containing features generated by -featout" << endl <<
"  -dict    A dictionary file (one 'word/pron' entry per line, multiple possible)" << endl <<
"  -subword A file of subword units. This will enable unknown word PE." << endl <<
"  -model   The file to write the trained model to" << endl <<
"  -modtext Print a text model (instead of the default binary)" << endl <<
"  -featout Write the features used in training the model to this file" << endl <<
"Model Training Options (basic)" << endl <<
"  -nows    Don't train a word segmentation model" << endl <<
"  -notags  Skip the training of tagging, do only word segmentation" << endl <<
"  -global  Train the nth tag with a global model (good for POS, bad for PE)" << endl <<
"  -debug   The debugging level during training (0=silent, 1=normal, 2=detailed)" << endl <<
"Model Training Options (for advanced users): " << endl <<
"  -charw   The character window to use for WS (3)" << endl <<
"  -charn   The character n-gram length to use for WS for WS (3)" << endl <<
"  -typew   The character type window to use for WS (3)" << endl <<
"  -typen   The character type n-gram length to use for WS for WS (3)" << endl <<
"  -dictn   Dictionary words greater than -dictn will be grouped together (4)" << endl <<
"  -unkn    Language model n-gram order for unknown words (3)" << endl <<
"  -eps     The epsilon stopping criterion for classifier training" << endl <<
"  -cost    The cost hyperparameter for classifier training" << endl <<
"  -nobias  Don't use a bias value in classifier training" << endl <<
"  -solver  The solver (1=SVM, 7=logistic regression, etc.; default 1,"<<endl<<
"           see LIBLINEAR documentation for more details)" << endl <<
"Format Options (for advanced users): " << endl <<
"  -wordbound The separator for words in full annotation (\" \")" << endl <<
"  -tagbound  The separator for tags in full/partial annotation (\"/\")" << endl <<
"  -elembound The separator for candidates in full/partial annotation (\"&\")" << endl <<
"  -unkbound  Indicates unannotated boundaries in partial annotation (\" \")" << endl <<
"  -skipbound Indicates skipped boundaries in partial annotation (\"?\")" << endl <<
"  -nobound   Indicates non-existence of boundaries in partial annotation (\"-\")" << endl <<
"  -hasbound  Indicates existence of boundaries in partial annotation (\"|\")" << endl << endl;
    }
    else {
        // print the testing usage
        cerr << 
"kytea:" << endl << 
"  KyTea, a word segmentation/pronunciation estimation tool" << endl <<
"" << endl <<
"Analysis Options: " << endl <<
"  -model   The model file to use when analyzing text" << endl <<
"  -nows    Don't do word segmentation (raw input cannot be accepted)" << endl <<
"  -notags  Do only word segmentation, no tagging" << endl <<
"  -notag   Skip the tag of the nth tag (n starts at 1)" << endl <<
"  -nounk   Don't estimate the pronunciation of unknown words" << endl <<
"  -wsconst Specifies character types to not be segmented (e.g. D for digits)" << endl <<
"  -unkbeam The width of the beam to use in beam search for unknown words " << endl <<
"           (default 50, 0 for full search)" << endl <<
"  -debug   The debugging level (0=silent, 1=simple, 2=detailed)" << endl <<
"Format Options: " << endl <<
"  -in      The formatting of the input  (raw/tok/full/part/conf, default raw)" << endl <<
"  -out     The formatting of the output (full/part/conf/eda/tags, default full)" << endl <<
"  -tagmax  The maximum number of tags to print for one word (default 3," << endl <<
"            0 implies no limit)" << endl << 
"  -deftag  A tag for words that cannot be given any tag (for example, "<<endl<<
"           unknown words that contain a character not in the subword dictionary)" << endl << 
"  -unktag  A tag to append to indicate words not in the dictionary" << endl <<
"Format Options (for advanced users): " << endl <<
"  -wordbound The separator for words in full annotation (\" \")" << endl <<
"  -tagbound  The separator for tags in full/partial annotation (\"/\")" << endl <<
"  -elembound The separator for candidates in full/partial annotation (\"&\")" << endl <<
"  -unkbound  Indicates unannotated boundaries in partial annotation (\" \")" << endl <<
"  -skipbound Indicates skipped boundaries in partial annotation (\"?\")" << endl <<
"  -nobound   Indicates non-existence of boundaries in partial annotation (\"-\")" << endl <<
"  -hasbound  Indicates existence of boundaries in partial annotation (\"|\")" << endl << endl;
    }
    exit(1);
}

void KyteaConfig::printVersion() {
    cerr << "KyTea version "<<VERSION<<endl;
    exit(0);
}

void KyteaConfig::ch(const char * n, const char* v) {
    if(!v) {
        cerr << "Stray '" << n << "' argument" << endl << endl;
        printUsage();
    }
}       
// parse a single argument
//  the value argument can be null
//  return 1 if the value was used 0 if not
unsigned KyteaConfig::parseTrainArg(const char * n, const char * v) {
    unsigned r=1;

    if(!strcmp(n, "--help") || !strcmp(n,"-help")) { printUsage(); }
    else if(!strcmp(n, "--version") || !strcmp(n,"-version")) { printVersion(); }

    // general input/output option
    else if(!strcmp(n, "-encode"))   { ch(n,v); setEncoding(v); }
    else if(!strcmp(n, "-debug"))    { ch(n,v); setDebug(util_->parseInt(v)); }

    // input options for training
    else if(!strcmp(n, "-full"))     { ch(n,v); addCorpus(v, CORP_FORMAT_FULL); }
    else if(!strcmp(n, "-tok"))     { ch(n,v); addCorpus(v, CORP_FORMAT_TOK); }
    else if(!strcmp(n, "-part"))     { ch(n,v); addCorpus(v, CORP_FORMAT_PART); }
    else if(!strcmp(n, "-conf"))     { ch(n,v); addCorpus(v, CORP_FORMAT_PROB); }
    else if(!strcmp(n, "-dict"))     { ch(n,v); addDictionary(v); }
    else if(!strcmp(n, "-subword"))  { ch(n,v); addSubwordDict(v); }
    else if(!strcmp(n, "-global"))   { ch(n,v); setGlobal(util_->parseInt(v)-1); }

    // output option for training
    else if(!strcmp(n, "-model"))    { ch(n,v); setModelFile(v); }
    else if(!strcmp(n, "-modtext"))  { setModelFormat('T'); r=0; }
    else if(!strcmp(n, "-featout"))  { ch(n,v); setFeatureOut(v); }
    else if(!strcmp(n, "-feat"))     { ch(n,v); setFeatureIn(v); }
    else if(!strcmp(n, "-numtags"))  { ch(n,v); setNumTags(util_->parseInt(v)); }

    // liblinear options
    else if(!strcmp(n, "-eps"))      { ch(n,v); setEpsilon(util_->parseFloat(v)); }
    else if(!strcmp(n, "-cost"))      { ch(n,v); setCost(util_->parseFloat(v)); }
    else if(!strcmp(n, "-solver"))   { ch(n,v); setSolverType(util_->parseInt(v)); }

    // feature options
    else if(!strcmp(n, "-charw"))    { ch(n,v); setCharWindow(util_->parseInt(v)); }
    else if(!strcmp(n, "-charn"))    { ch(n,v); setCharN(util_->parseInt(v)); }
    else if(!strcmp(n, "-typew"))    { ch(n,v); setTypeWindow(util_->parseInt(v)); }
    else if(!strcmp(n, "-typen"))    { ch(n,v); setTypeN(util_->parseInt(v)); }
    else if(!strcmp(n, "-dictn"))    { ch(n,v); setDictionaryN(util_->parseInt(v)); }
    else if(!strcmp(n, "-unkn"))     { ch(n,v); setUnkN(util_->parseInt(v)); }

    // formatting options
    else if(!strcmp(n, "-wordbound"))     { ch(n,v); setWordBound(v); }
    else if(!strcmp(n, "-tagbound"))      { ch(n,v); setTagBound(v); }
    else if(!strcmp(n, "-elembound"))     { ch(n,v); setElemBound(v); }
    else if(!strcmp(n, "-unkbound"))      { ch(n,v); setUnkBound(v); }
    else if(!strcmp(n, "-nobound"))       { ch(n,v); setNoBound(v); }
    else if(!strcmp(n, "-hasbound"))      { ch(n,v); setHasBound(v); }
    else if(!strcmp(n, "-skipbound"))     { ch(n,v); setSkipBound(v); }

    // whether or not to perform word segmentation, pronunciation estimation
    else if(!strcmp(n, "-nows"))     { setDoWS(false); r=0; }
    else if(!strcmp(n, "-notags"))   { setDoTags(false); r=0; }
    else if(!strcmp(n, "-nobias"))   { setBias(false); r=0; }

    // --- DEPRECATED ---
    // do not use these undocumented options, as they may disappear in the future
    else if(!strcmp(n, "-prob"))     { ch(n,v); addCorpus(v, CORP_FORMAT_PROB); }
    else if(!strcmp(n, "-dicn"))    { ch(n,v); setDictionaryN(util_->parseInt(v)); }
    
    else if(n[0] == '-') {
        cerr << "Invalid argument '" << n << "'" << endl << endl;
        printUsage();
    }

    else { r=0; args_.push_back(n); }
    return r;
}

unsigned KyteaConfig::parseRunArg(const char * n, const char * v) {
    unsigned r=1;

    if(!strcmp(n, "--help") || !strcmp(n,"-help")) { printUsage(); }
    else if(!strcmp(n, "--version") || !strcmp(n,"-version")) { printVersion(); }

    // general input/output option
    else if(!strcmp(n, "-in"))       { ch(n,v); setIOFormat(v, inputForm_);  }
    else if(!strcmp(n, "-out"))      { ch(n,v); setIOFormat(v, outputForm_); }

    // output option for training
    else if(!strcmp(n, "-model"))    { ch(n,v); setModelFile(v); }

    // whether or not to perform word segmentation, pronunciation estimation
    else if(!strcmp(n, "-nows"))     { setDoWS(false); r=0; }
    else if(!strcmp(n, "-wsconst"))  { ch(n,v); setWsConstraint(v); }
    else if(!strcmp(n, "-notags"))   { setDoTags(false); r=0; }
    else if(!strcmp(n, "-notag"))    { 
        ch(n,v); 
        if(util_->parseInt(v) < 1) THROW_ERROR("Illegal setting "<<v<<" for -notag (must be 1 or greater)");
        setDoTag(util_->parseInt(v)-1,false);
    }
    else if(!strcmp(n, "-nounk"))    { setDoUnk(false); r=0; }
    else if(!strcmp(n, "-numtags"))  { ch(n,v); setNumTags(util_->parseInt(v)); }
    else if(!strcmp(n, "-tagmax"))   { ch(n,v); setTagMax(util_->parseInt(v)); }

    // the limit on the number of unknown words to output
    else if(!strcmp(n, "-unktag"))   { ch(n,v); setUnkTag(v); }
    else if(!strcmp(n, "-deftag"))   { ch(n,v); setDefaultTag(v); }
    else if(!strcmp(n, "-unkbeam"))  { ch(n,v); setUnkBeam(util_->parseInt(v)); }
    else if(!strcmp(n, "-debug"))    { ch(n,v); setDebug(util_->parseInt(v)); }

    // formatting options
    else if(!strcmp(n, "-wordbound"))     { ch(n,v); setWordBound(v); }
    else if(!strcmp(n, "-tagbound"))      { ch(n,v); setTagBound(v); }
    else if(!strcmp(n, "-elembound"))     { ch(n,v); setElemBound(v); }
    else if(!strcmp(n, "-unkbound"))      { ch(n,v); setUnkBound(v); }
    else if(!strcmp(n, "-nobound"))       { ch(n,v); setNoBound(v); }
    else if(!strcmp(n, "-hasbound"))      { ch(n,v); setHasBound(v); }
    else if(!strcmp(n, "-skipbound"))     { ch(n,v); setSkipBound(v); }

    else if(n[0] == '-') {
        cerr << "Invalid argument '" << n << "'" << endl << endl;
        printUsage();
    }

    else { r=0; args_.push_back(n); }
    return r;
}

// set the encoding of the StringUtil class and reset all the IOs
void KyteaConfig::setEncoding(const char* str) {
    if(util_)
        delete util_;
    if(!strcmp(str,"utf8")) util_ = new StringUtilUtf8();
    else if(!strcmp(str,"euc")) util_ = new StringUtilEuc();
    else if(!strcmp(str,"sjis")) util_ = new StringUtilSjis();
    else
        THROW_ERROR("Unsupported encoding format '" << str << "'");
}


KyteaConfig::KyteaConfig() : onTraining_(true), debug_(0), util_(0), dicts_(), 
                modelForm_('B'), inputForm_(CORP_FORMAT_DEFAULT),
                outputForm_(CORP_FORMAT_FULL), featStr_(0),
                doWS_(true), doTags_(true), doUnk_(true),
                addFeat_(false), confidence_(0.0), charW_(3), charN_(3), 
                typeW_(3), typeN_(3), dictN_(4), 
                unkN_(3), unkBeam_(50), defTag_("UNK"), unkTag_(),
                bias_(1.0f), eps_(HUGE_VAL), cost_(1.0),
                solverType_(1/*SVM*/),
                wordBound_(" "), tagBound_("/"), elemBound_("&"), unkBound_(" "), 
                noBound_("-"), hasBound_("|"), skipBound_("?"), escape_("\\"), 
                wsConstraint_(""),
                numTags_(0), tagMax_(3) {
    setEncoding("utf8");
}
KyteaConfig::KyteaConfig(const KyteaConfig & rhs) 
              :  onTraining_(rhs.onTraining_), debug_(rhs.debug_), 
                 util_(rhs.util_), dicts_(rhs.dicts_),
                 modelForm_(rhs.modelForm_), inputForm_(rhs.inputForm_), 
                 outputForm_(rhs.outputForm_), featStr_(rhs.featStr_), 
                 doWS_(rhs.doWS_), doTags_(rhs.doTags_), 
                 doUnk_(rhs.doUnk_), addFeat_(rhs.addFeat_), 
                 confidence_(rhs.confidence_), charW_(rhs.charW_), 
                 charN_(rhs.charN_), typeW_(rhs.typeW_), 
                 typeN_(rhs.typeN_), dictN_(rhs.dictN_), 
                 unkN_(rhs.unkN_), unkBeam_(rhs.unkBeam_), 
                 defTag_(rhs.defTag_), unkTag_(rhs.unkTag_), 
                 bias_(rhs.bias_), eps_(rhs.eps_), cost_(rhs.cost_), 
                 solverType_(rhs.solverType_), wordBound_(rhs.wordBound_), 
                 tagBound_(rhs.tagBound_), elemBound_(rhs.elemBound_), 
                 unkBound_(rhs.unkBound_), noBound_(rhs.noBound_), 
                 hasBound_(rhs.hasBound_), skipBound_(rhs.skipBound_), 
                 escape_(rhs.escape_), numTags_(rhs.numTags_), tagMax_(rhs.tagMax_)
{

}

KyteaConfig::~KyteaConfig() {
    if(util_)
        delete util_;
}

void KyteaConfig::addCorpus(const std::string & corp, CorpusFormat format) {
    corpora_.push_back(corp);
    corpusFormats_.push_back(format);
}

void KyteaConfig::addDictionary(const std::string & corp) {
    dicts_.push_back(corp);
}

void KyteaConfig::addSubwordDict(const std::string & corp) {
    subwordDicts_.push_back(corp);
}

const char KyteaConfig::getEncoding() const { return util_->getEncoding(); }
const char* KyteaConfig::getEncodingString() const { return util_->getEncodingString(); }

std::ostream * KyteaConfig::getFeatureOutStream() {
    if(featOut_.length() && !featStr_)
        featStr_ = new std::ofstream(featOut_.c_str());
    return featStr_;
}
void KyteaConfig::closeFeatureOutStream() {
    if(featStr_) {
        delete featStr_;
        featStr_ = 0;
    }
}
